import subprocess
import os

# Model identifier passed to the server's --model flag.
model_name = "hf-models/glm-4-9b-chat"

# Seconds to wait for the server to exit after a terminate request before
# falling back to a forced kill.
shutdown_timeout_seconds = 10

# Command line that launches vLLM's OpenAI-compatible API server module.
api_server_command = [
    "python",
    "-m",
    "vllm.entrypoints.openai.api_server",
    "--model",
    model_name,
    "--dtype",
    "float16",
    # NOTE(review): an empty API key is passed here; confirm that running
    # without a key is intended for this deployment.
    "--api-key",
    "",
    "--tensor-parallel-size",
    "1",
    "--trust-remote-code",
    "--gpu-memory-utilization",
    "0.8",
    "--max-num-batched-tokens",
    "10000",
    "--max-model-len",
    "10000",
    "--port",
    "7860",
    # Optional flag, currently disabled:
    # "--enforce-eager"
]
api_process = subprocess.Popen(api_server_command, text=True)
print("开始启动 api 服务")

try:
    # Block until the server exits on its own or this script is interrupted.
    api_process.wait()
finally:
    # Only signal the child if it is still running; after a normal exit there
    # is nothing left to stop.
    if api_process.poll() is None:
        # Ask for a graceful shutdown first so the server can release its
        # resources, then force-kill only if it does not exit in time.
        api_process.terminate()
        try:
            api_process.wait(timeout=shutdown_timeout_seconds)
        except (subprocess.TimeoutExpired, KeyboardInterrupt):
            # Timed out, or the user interrupted again: stop waiting politely.
            api_process.kill()
            # Reap the child so it does not linger as a zombie process.
            api_process.wait()
    print("Servers shut down.")